reticulate::use_python("/anaconda3/bin/python")
library(readr)
library(ggplot2)
library(dplyr)
library(methods)
library(stringi)
library(keras)

Introduction:

For my final project I wanted to use photos that I had taken and identify whether each photo was of an individual person, a couple, or a group. I sifted through years of photos and tried to choose those that were most representative of a solo, couple, or group photo without having people in the background; however, in order to have enough pictures for this project I had to include many couple and solo photos with people in the background that may potentially cause confusion with the group classification.

First I will import the Resnet model and grab the second to last layer of it so that we may use it to predict.

# Load the pretrained ResNet-50 network (with its classification head kept)
# and build a second model whose output is the 'avg_pool' layer -- the
# penultimate layer -- so it can produce image embeddings.
resnet50 <- application_resnet50(weights = 'imagenet', include_top = TRUE)

embedding_layer <- get_layer(resnet50, 'avg_pool')
model_avg_pool <- keras_model(
  inputs = resnet50$input,
  outputs = embedding_layer$output
)

Let's take a look at one photo of each type.

# Load an example solo photo, resize it to 224x224, and add a leading batch
# dimension so its shape matches what the network expects.
image_path <- "/Users/aidanbond/Desktop/STAT389/Final Project/StatProjectPhotos/Solo/SoloPhoto_44.JPG"
solo <- image_to_array(image_load(image_path, target_size = c(224, 224)))
solo <- array_reshape(solo, c(1, dim(solo)))
dim(solo)
## [1]   1 224 224   3

# Display it: open an empty, axis-free unit square and draw the pixel
# values (rescaled from 0-255 to 0-1) as a raster image.
par(mar = rep(0, 4L))
plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n", asp = 1)
rasterImage(solo[1, , , ] / 255, 0, 0, 1, 1)

# Load an example couple photo, resize it to 224x224, and add a leading
# batch dimension so its shape matches what the network expects.
image_path <- "/Users/aidanbond/Desktop/STAT389/Final Project/StatProjectPhotos/Couple/CouplePhoto_26.JPG"
couple <- image_to_array(image_load(image_path, target_size = c(224, 224)))
couple <- array_reshape(couple, c(1, dim(couple)))
dim(couple)
## [1]   1 224 224   3

# Display it: open an empty, axis-free unit square and draw the pixel
# values (rescaled from 0-255 to 0-1) as a raster image.
par(mar = rep(0, 4L))
plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n", asp = 1)
rasterImage(couple[1, , , ] / 255, 0, 0, 1, 1)

# Load an example group photo, resize it to 224x224, and add a leading
# batch dimension so its shape matches what the network expects.
image_path <- "/Users/aidanbond/Desktop/STAT389/Final Project/StatProjectPhotos/Group/GroupPhoto_47.JPG"
group <- image_to_array(image_load(image_path, target_size = c(224, 224)))
group <- array_reshape(group, c(1, dim(group)))
dim(group)
## [1]   1 224 224   3

# Display it: open an empty, axis-free unit square and draw the pixel
# values (rescaled from 0-255 to 0-1) as a raster image.
par(mar = rep(0, 4L))
plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n", asp = 1)
rasterImage(group[1, , , ] / 255, 0, 0, 1, 1)

For some reason it seems that when presenting the photos they are flipped from vertical to horizontal randomly. Perhaps even though these images are displayed and recognized as vertical by my camera and computer, when R reads them it assumes the camera was always perfectly level and not turned 90 degrees.

First I will run a neural network to see how well it can predict the dataset.

# Metadata for every photo: an id, a train/valid split label, the numeric
# class, the class name, and the path to the image on disk.
image_data <- read_csv("/Users/aidanbond/Desktop/STAT389/Final Project/my-image-data.csv")
## Parsed with column specification:
## cols(
##   obs_id = col_character(),
##   train_id = col_character(),
##   class = col_integer(),
##   class_name = col_character(),
##   path_to_image = col_character()
## )
# Pre-computed embeddings, one row per image, aligned with image_data.
# NOTE(review): this path is relative while every other path in the file is
# absolute -- presumably the working directory is the project folder; confirm,
# or use the full path for consistency.
X <- read_rds("my-image-embed.rds")
# Training design matrix and one-hot encoded labels, restricted to the rows
# flagged "train" in the metadata.
X_train <- X[image_data$train_id == "train",]
y_train <- to_categorical(image_data$class[image_data$train_id == "train"])

# A small feed-forward classifier on top of the embeddings:
# two 512-unit ReLU layers with 50% dropout, then a softmax over the classes.
model <- keras_model_sequential()
model %>%
  layer_dense(units = 512, input_shape = ncol(X_train)) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 512) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = ncol(y_train)) %>%
  layer_activation(activation = "softmax")

# Cross-entropy loss, RMSprop at one third of the default learning rate.
model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.001 / 3),
  metrics = c('accuracy')
)

# Train for 10 epochs on the pre-computed embeddings.
history <- model %>% fit(X_train, y_train, epochs = 10)

It seems that most of the time this neural network has a validation rate between 60%-70% which is about double the 33% rate of random guessing.

# Predict a class for every image, then compute the accuracy separately for
# the training and validation splits.
y_pred <- predict_classes(model, X)
is_correct <- image_data$class == y_pred
tapply(is_correct, image_data$train_id, mean)
##     train     valid 
## 0.9084507 0.6421053

Next I will do transfer learning, first by importing the image directory, which contains the three folders of solo, couple, and group photos. I should note that since the photos will be converted to 224x224 images, I chose to use only photos that were horizontal, so that they would all be squashed horizontally rather than a mix of some squashed vertically and some horizontally. In the end I had to download photos from Google Images in order to meet the 75-photo requirement for the solo and couple categories.

input_dir <- "/Users/aidanbond/Desktop/STAT389/Final Project/StatProjectPhotos"

# Find every image under the input directory; each class lives in its own
# sub-folder (Couple / Group / Solo), so the directory name is the label.
image_paths <- dir(input_dir, recursive = TRUE)
ext <- stri_match(image_paths, regex = "\\.([A-Za-z]+$)")[,2]
# The extension is lower-cased before comparison, so only lower-case values
# can match ("JPG" in the original comparison set was dead code -- after
# stri_trans_tolower() it could never match anything).
image_paths <- image_paths[stri_trans_tolower(ext) %in% c("jpg", "jpeg")]
class_vector <- dirname(image_paths)
class_names <- levels(factor(class_vector))

# Read every image into a single n x 224 x 224 x 3 array, with zero-based
# integer labels (the encoding to_categorical() expects).
n <- length(class_vector)
Z <- array(0, dim = c(n, 224, 224, 3))
y <- as.numeric(factor(class_vector)) - 1L
for (i in seq_len(n))
{
  pt <- file.path(input_dir, image_paths[i])
  image <- image_to_array(image_load(pt, target_size = c(224, 224)))
  # image is already 224 x 224 x 3, so it can be assigned to the slice
  # directly; the original reshape to a 1 x 224 x 224 x 3 array was a no-op.
  Z[i, , , ] <- image
}

Since we are using stochastic gradient descent it is important that we randomize the sequence of the data.

# Stochastic gradient descent benefits from observations arriving in random
# order, so shuffle the images and labels with one shared permutation.
# A fixed seed keeps the shuffle reproducible.
set.seed(1)
shuffle_idx <- sample(seq_len(nrow(Z)))
Z <- Z[shuffle_idx, , , ]
y <- y[shuffle_idx]

Now we must embed the data and use the embedded data to construct a validation and training matrix.

# Run every photo through ImageNet preprocessing and the truncated ResNet
# to obtain one 2048-dimensional embedding per image.
X <- predict(model_avg_pool, x = imagenet_preprocess_input(Z), verbose = TRUE)
dim(X)
## [1]  237 2048

I chose to go with a 70-30 train-validation split, because it produced a 70% validation rate much more consistently than other splits after numerous tests of re-randomizing the order of the data and the splits. Of course there is some luck involved, as the success of this split varied greatly from ~60% up to even 80%; however, much of that seemed to result from how easy it was to predict the 30% of photos that weren't used in the training.

# Randomly assign each image to the training (70%) or validation (30%) set.
# (`replace` is now named rather than positional, for clarity.)
train_id <- sample(c("train", "valid"), nrow(X), replace = TRUE, prob = c(0.7, 0.3))

# Training matrix and one-hot labels for the embedded photos.
X_train <- X[train_id == "train",]
y_train <- to_categorical(y[train_id == "train"])

# Same architecture as the first network, but with lighter (25%) dropout.
model <- keras_model_sequential()
model %>%
  layer_dense(units = 512, input_shape = ncol(X_train)) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.25) %>%

  # `units` is named here for consistency with the other dense layers
  # (the original used positional `layer_dense(512)`).
  layer_dense(units = 512) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = 0.25) %>%

  layer_dense(units = ncol(y_train)) %>%
  layer_activation(activation = "softmax")

# Cross-entropy loss, RMSprop at one third of the default learning rate.
model %>% compile(loss = 'categorical_crossentropy',
                  optimizer = optimizer_rmsprop(lr = 0.001 / 3),
                  metrics = c('accuracy'))

# Train for 10 epochs and plot the loss/accuracy history.
history <- model %>%
  fit(X_train, y_train, epochs = 10)
plot(history)
plot(history)

# Accuracy of the transfer-learning model on each split.
y_pred <- predict_classes(model, X)
hit <- (y == y_pred)
tapply(hit, train_id, mean)
##     train     valid 
## 0.9882353 0.6567164

Here is the confusion matrix, and as you can see it appears that many couple photos were confused for being group photos or solo photos. The group photos also had issues being classified as couple photos, most likely due to cases with both where there were extras in the background.

# Confusion matrix (true class vs predicted class), split by train/valid.
# y and y_pred are zero-based, so + 1L maps them back to class_names.
table(value = class_names[y + 1L], prediction = class_names[y_pred + 1L], train_id)
## , , train_id = train
## 
##         prediction
## value    Couple Group Solo
##   Couple     53     2    0
##   Group       0    68    0
##   Solo        0     0   47
## 
## , , train_id = valid
## 
##         prediction
## value    Couple Group Solo
##   Couple      3    11    6
##   Group       2    18    0
##   Solo        1     3   23

Next we will get the probability values, so that we will be able to see which pictures best represented the model's version of "solo", "couple", and "group" photos.

# Predicted class probabilities for every image (one column per class).
y_probs <- predict(model, X)

It seems that for the most part, a close up photo with a blank background, or no one else in the background, had the highest probability of being a solo photo.

type <- "Solo"

# The 12 images the model is most confident belong to this class.
class_col <- which(class_names == type)
id <- order(y_probs[, class_col], decreasing = TRUE)[1:12]

# Draw them in a 3 x 4 grid, labelling each with its true class in red.
par(mfrow = c(3, 4))
for (i in id) {
  par(mar = rep(0, 4L))
  plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n", asp = 1)
  rasterImage(Z[i, , , ] / 255, 0, 0, 1, 1)
  text(0.5, 0.1, label = class_names[y[i] + 1L], col = "red", cex = 2)
}

Here we can see some examples of the easiest couple photos to identify, and as you can see, there are often little or no people in the background, and if there are people in the background they are typically far away.

type <- "Couple"

# The 12 images the model is most confident belong to this class.
class_col <- which(class_names == type)
id <- order(y_probs[, class_col], decreasing = TRUE)[1:12]

# Draw them in a 3 x 4 grid, labelling each with its true class in red.
par(mfrow = c(3, 4))
for (i in id) {
  par(mar = rep(0, 4L))
  plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n", asp = 1)
  rasterImage(Z[i, , , ] / 255, 0, 0, 1, 1)
  text(0.5, 0.1, label = class_names[y[i] + 1L], col = "red", cex = 3)
}

Here we can again see a similar trend, in that the photos most confidently determined to be group photos had many people lined up neatly, looking into the camera, with no people in the background.

type <- "Group"

# The 12 images the model is most confident belong to this class.
class_col <- which(class_names == type)
id <- order(y_probs[, class_col], decreasing = TRUE)[1:12]

# Draw them in a 3 x 4 grid, labelling each with its true class in red.
par(mfrow = c(3, 4))
for (i in id) {
  par(mar = rep(0, 4L))
  plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n", asp = 1)
  rasterImage(Z[i, , , ] / 255, 0, 0, 1, 1)
  text(0.5, 0.1, label = class_names[y[i] + 1L], col = "red", cex = 3)
}

Here we can see examples of photos that were not classified correctly. In many of the photos there are people wandering in the background who appear to be similar in size to the people in the photo. Some of the couple misclassifications may have been confused with solo photos due to the uniform background.

# Display the misclassified photos, each labelled with its true class.
par(mfrow = c(2, 3))
id <- which(y_pred != y)
for (i in id) {
  par(mar = rep(0, 4L))
  # asp = 1 keeps the raster from being stretched to fill the panel; this
  # was missing here even though every other image display in the file
  # draws with a fixed aspect ratio.
  plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n", asp = 1)
  rasterImage(Z[i, , , ] / 255, 0, 0, 1, 1)
  text(0.5, 0.1, label = class_names[y[i] + 1L], col = "red", cex = 4)
}

Now when we visualize the embedding it appears that the solo and group photos are clearly separated from left to right, however, the couple photos are more spread out. The couple photos that are in the main grouping of the solo photos are most likely those that have a blank background, and the couple photos that are in the main grouping of group photos are most likely those that have people wandering in the background.

# Project the embeddings onto their first two principal components and
# colour each point by its true class.
pc_scores <- prcomp(X)$x
pca <- as_tibble(pc_scores[, 1:2])
pca$y <- class_names[y + 1L]

ggplot(pca, aes(PC1, PC2)) +
  geom_point(aes(color = y), alpha = 0.3, size = 5) +
  labs(x = "", y = "", color = "class") +
  theme_minimal()